library(dplyr)
library(tidyr)
library(rmatio)
library(stringr)

##########################################################################
#### Output source men and women data with education levels - whites 140 and blacks 345 ####
##########################################################################

rm(list = ls())

#
create_source <- function(cohort, no_education_groups){
  
  cohortwomen <- cohort + 1
  
  #Whites
  load(file = paste0("output_data/probsnonpar140_", no_education_groups, "educgrp.RData"))
  
  mendata <- mendata %>% mutate(YearBornHusband = (AgeHusband != 0)*(YearSurvey - AgeHusband)) %>% mutate(YearBornWife = (AgeWife != 0)*(YearSurvey - AgeWife))
  womendata <- womendata %>% mutate(YearBornHusband = (AgeHusband != 0)*(YearSurvey - AgeHusband)) %>% mutate(YearBornWife = (AgeWife != 0)*(YearSurvey - AgeWife))
  
  men140 <- mendata %>% filter(RaceHusband == 1 | RaceWife == 1) %>% filter((YearBornHusband == cohort | YearBornHusband == 0)) %>% select(HHweight, EducHusband, EducWife)
  women140 <- womendata %>% filter(RaceHusband == 1 | RaceWife == 1) %>% filter((YearBornWife == cohort + 1 | YearBornWife == 0)) %>%  select(HHweight, EducWife, EducHusband)
  
  #Blacks
  load(file = paste0("output_data/probsnonpar345_", no_education_groups, "educgrp.RData"))
  
  mendata <- mendata %>% mutate(YearBornHusband = (AgeHusband != 0)*(YearSurvey - AgeHusband)) %>% mutate(YearBornWife = (AgeWife != 0)*(YearSurvey - AgeWife))
  womendata <- womendata %>% mutate(YearBornHusband = (AgeHusband != 0)*(YearSurvey - AgeHusband)) %>% mutate(YearBornWife = (AgeWife != 0)*(YearSurvey - AgeWife))
  
  men345 <- mendata %>% filter(RaceHusband == 3 | RaceWife == 3) %>% filter((YearBornHusband == cohort | YearBornHusband == 0)) %>% select(HHweight, EducHusband, EducWife)
  women345 <- womendata %>% filter(RaceHusband == 3 | RaceWife == 3) %>% filter((YearBornWife == cohort + 1 | YearBornWife == 0)) %>% select(HHweight, EducWife, EducHusband)
  
  men140 <- as.matrix(men140)
  women140 <- as.matrix(women140)
  men345 <- as.matrix(men345)
  women345 <- as.matrix(women345)
  
  output <- list(men140, women140, men345, women345)
  names(output) <- paste0(c("men", "women", "men", "women"), c(cohort, cohortwomen, cohort, cohortwomen), "_", c("140", "140", "345", "345"))
  return(output)
}
#



# ## checks with earlier verion - checks OK on Aug 2, 2020
# test <- create_source(1940, 2)
# lapply(test, dim)
# lapply(test, slice, 1:5)
# test$women1941_140 %>% slice(10:15)
# 
# test <- create_source(1967, 2)
# lapply(test, dim)
# lapply(test, slice, 1:5)

# ## Sanity check for 5 educ levels - OK on Aug 2, 2020
#test <- create_source(1940, 5)
#lapply(test, dim)
#lapply(test, slice, 1:5)


output <- lapply(1940:1967, create_source, no_education_groups = 5)
output <- unlist(output, recursive = FALSE)

write.mat(object = output, filename="matlab_data/source_allyears_data.mat")



##########################################################################
### Output full couple sample - whites 140 and blacks 345 - all years ####
##########################################################################
rm(list = ls())

finalsample <- read.csv(file = "output_data/finalsample_csw.csv")

createfull <- function(cohort,  no_education_groups){
  temp <- finalsample
  if (no_education_groups == 2){
    finalsample$EducHusband[finalsample$EducHusband %in% c(1,2,3)] <- 2
    finalsample$EducHusband[finalsample$EducHusband %in% c(4,5)] <- 1
    finalsample$EducWife[finalsample$EducWife %in% c(1,2,3)] <- 2
    finalsample$EducWife[finalsample$EducWife%in% c(4,5)] <- 1
  }
  temp <- temp %>% mutate(YearBornHusband = (AgeHusband != 0)*(YearSurvey - AgeHusband))
  temp <- temp %>% mutate(YearBornWife = (AgeWife != 0)*(YearSurvey - AgeWife))
  
  full140 <- temp %>% filter(RaceHusband == 1 | RaceWife == 1) %>% filter((YearBornHusband == cohort | YearBornHusband == 0) & (YearBornWife == cohort + 1 | YearBornWife == 0))
  full345 <- temp %>% filter(RaceHusband == 3 | RaceWife == 3) %>% filter((YearBornHusband == cohort | YearBornHusband == 0) & (YearBornWife == cohort + 1 | YearBornWife == 0))
  
  full140 <- as.matrix(full140)
  full345 <- as.matrix(full345)
  output <- list(full140, full345)
  names(output) <- paste0(c("full", "full"), cohort, c("_140", "_345"))
  return(output)
}

# #check consistency with earlier version - OK on Aug 2, 2020
# test <- createfull(1940, 2)
# lapply(test, dim) 
# test <- createfull(1967, 2)
# lapply(test, dim) 
# test <- createfull(1962, 2)
# lapply(test, dim)  


output <- lapply(1940:1967, createfull, no_education_groups = 5)
output <- unlist(output, recursive = FALSE)

write.mat(object = output, filename="matlab_data/full_allyears_data.mat")


##########################################################################
#### Output proba and conditional proba matching data  - whites 140 and blacks 345 ####
##########################################################################
# 
rm(list = ls())
#
create_probs <- function(cohort, no_education_groups){
  
  cohortwomen <- cohort + 1
  education_groups <- matrix(1:no_education_groups,nrow = no_education_groups, ncol = 1)
  
  ## Whites
  load(file = paste0("output_data/probsnonpar140_", no_education_groups, "educgrp.RData"))
  mendata <- mendata %>% mutate(YearBornHusband = (AgeHusband != 0)*(YearSurvey - AgeHusband))
  womendata <- womendata %>% mutate(YearBornWife = (AgeWife != 0)*(YearSurvey - AgeWife))
  
  condprobsmen_140 <- probsmen[, cohort - begcohmen + 1,]
  probsmen_140 <- matrix(apply(X = education_groups, MARGIN = 1, FUN = function(educ){(mendata %>% filter(YearBornHusband == cohort, EducHusband == educ) %>% summarize(sum(HHweight)) %>% unlist)/(mendata %>% filter(YearBornHusband == cohort) %>% summarize(sum(HHweight)) %>% unlist)}), nrow = no_education_groups, ncol = 1)
  
  condprobswomen_140 <- probswomen[, cohortwomen - begcohwomen + 1,]
  probswomen_140 <- matrix(apply(X = education_groups, MARGIN = 1, FUN = function(educ){(womendata %>% filter(YearBornWife == cohortwomen, EducWife == educ) %>% summarize(sum(HHweight)) %>% unlist)/(womendata %>% filter(YearBornWife == cohortwomen) %>% summarize(sum(HHweight)) %>% unlist)}), nrow = no_education_groups, ncol = 1)
  
  # Blacks
  load(file = paste0("output_data/probsnonpar345_", no_education_groups, "educgrp.RData"))
  mendata <- mendata %>% mutate(YearBornHusband = (AgeHusband != 0)*(YearSurvey - AgeHusband))
  womendata <- womendata %>% mutate(YearBornWife = (AgeWife != 0)*(YearSurvey - AgeWife))
  
  condprobsmen_345 <- probsmen[, cohort - begcohmen + 1,]
  probsmen_345 <- matrix(apply(X = education_groups, MARGIN = 1, FUN = function(educ){(mendata %>% filter(YearBornHusband == cohort, EducHusband == educ) %>% summarize(sum(HHweight)) %>% unlist)/(mendata %>% filter(YearBornHusband == cohort) %>% summarize(sum(HHweight)) %>% unlist)}), nrow = no_education_groups, ncol = 1)
  
  condprobswomen_345 <- probswomen[, cohortwomen - begcohwomen + 1,]
  probswomen_345 <- matrix(apply(X = education_groups, MARGIN = 1, FUN = function(educ){(womendata %>% filter(YearBornWife == cohortwomen, EducWife == educ) %>% summarize(sum(HHweight)) %>% unlist)/(womendata %>% filter(YearBornWife == cohortwomen) %>% summarize(sum(HHweight)) %>% unlist)}), nrow = no_education_groups, ncol = 1)
  output <- list(condprobsmen_140, probsmen_140, condprobswomen_140, probswomen_140, condprobsmen_345, probsmen_345, condprobswomen_345, probswomen_345)
  names(output) <- paste0(c("condprobsmen", "probsmen", "condprobswomen", "probswomen", "condprobsmen", "probsmen", "condprobswomen", "probswomen"), c(cohort, cohort, cohortwomen, cohortwomen, cohort, cohort, cohortwomen, cohortwomen),
                          c("_140", "_140", "_140", "_140", "_345", "_345", "_345", "_345"))
  return(output)
}

output <- lapply(1940:1967, create_probs, no_education_groups = 5)
output <- unlist(output, recursive = FALSE)

write.mat(object = output, filename="matlab_data/probs_allyears_data.mat")

## check for probas - OK
#lapply(output, rowSums)[str_detect(names(output), "cond")] %>% unlist %>% unique # should be only equal to 1 
#lapply(output, sum)[!str_detect(names(output), "cond")] %>% unlist %>% unique # should be only equal to 1

# 
# ## Checks of the details in condprobs and probs - OK on 3, Aug 2020
# load(file = paste0("output_data/probsnonpar140_", 5, "educgrp.RData"))
# mendata %>% select(EducHusband) %>% unique
# probsmen[,1940 - begcohmen + 1,]
# mendata <- mendata %>% mutate(YearBornHusband = (AgeHusband != 0)*(YearSurvey - AgeHusband))
# mendata %>% filter(YearBornHusband == 1940) %>% group_by(EducHusband) %>% summarize(n = sum(HHweight)) %>% ungroup %>% mutate(n = n/sum(n))
# mendata %>% filter(YearBornHusband == 1940) %>% group_by(EducHusband) %>% summarize(n = sum(HHweight), 
#                                                                                     n_nevermarried = sum(HHweight*(EducWife == 0))/sum(HHweight),
#                                                                                     n_1 = sum(HHweight*(EducWife == 1))/sum(HHweight),
#                                                                                     n_2 = sum(HHweight*(EducWife == 2))/sum(HHweight))
